wordbankr


wordbankr

# Attach wordbankr and list the objects in its attached package environment.
library(wordbankr)
ls("package:wordbankr")
## [1] "fit_aoa"                 "get_administration_data"
## [3] "get_crossling_data"      "get_crossling_items"    
## [5] "get_instrument_data"     "get_instruments"        
## [7] "get_item_data"           "get_sources"            
## [9] "summarise_items"

wordbankr: instruments

# Store the result of get_instruments() and print it.
instruments <- get_instruments()
instruments

wordbankr: sources

# Keep the get_sources() result in `sources`; the outer parentheses print it.
(sources <- get_sources())

wordbankr: sources

# get_sources() again, this time with language set to "English (American)".
get_sources(language = "English (American)")

wordbankr: administrations

# Administration data requested for language "English (American)", form "WS";
# the outer parentheses print the stored result.
(admins_eng_ws <- get_administration_data(
  language = "English (American)",
  form = "WS"
))
# Number of distinct data_id values in that result.
n_distinct(admins_eng_ws[["data_id"]])
## [1] 5520

wordbankr: administrations

# Number of rows of admins_eng_ws at each value of age.
count(admins_eng_ws, age)

wordbankr: administrations

# production against age: small grey jittered points with ggplot2's default
# smoother drawn on top.
admins_eng_ws %>%
  ggplot(aes(age, production)) +
  geom_jitter(size = 0.5, colour = "grey") +
  geom_smooth()

wordbankr: administrations

# Administration data requested for language "Russian" (no form given);
# the outer parentheses print the stored result.
(admins_russian <- get_administration_data(language = "Russian"))

wordbankr: administrations

# Administration data requested for form "WS" (no language given);
# the outer parentheses print the stored result.
(admins_ws <- get_administration_data(form = "WS"))

wordbankr: administrations

# Administration data requested with no filter arguments; print it, then
# report how many rows came back.
(admins <- get_administration_data())
nrow(admins)
## [1] 82055

wordbankr: administrations

# Number of rows of admins for each language/form combination.
count(admins, language, form)

wordbankr: items

# Item data requested for language "English (American)", form "WS";
# the outer parentheses print the stored result.
(items_eng_ws <- get_item_data(
  language = "English (American)",
  form = "WS"
))

wordbankr: items

# Unique values of the type column.
distinct(items_eng_ws, type)

wordbankr: items

# Unique values of the category column.
distinct(items_eng_ws, category)

wordbankr: items

# Unique values of the lexical_category column.
distinct(items_eng_ws, lexical_category)

wordbankr: items

# Item data requested with no filter arguments; the outer parentheses print
# the stored result.
(items <- get_item_data())

wordbankr: items

# Number of rows of items for each language/form combination.
count(items, language, form)

wordbankr: data

# item_id values of the rows of items_eng_ws whose definition is "dog" or "cat".
ids <- items_eng_ws[["item_id"]][items_eng_ws[["definition"]] %in% c("dog", "cat")]

# Request instrument data for just those items.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids
)

wordbankr: data

# Same items as selected in `ids`, now with the administrations and iteminfo
# flags both switched on.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = TRUE,
  iteminfo = TRUE
)

wordbankr: data

# Rows of admins_eng_ws with age equal to 24, and rows of items_eng_ws whose
# definition is "dog" or "cat".
twos <- filter(admins_eng_ws, age == 24)
dog_cat <- filter(items_eng_ws, definition %in% c("dog", "cat"))

# Supply those two data frames as the administrations and iteminfo arguments.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = twos,
  iteminfo = dog_cat
)

wordbankr

Exercises

  1. Compute and plot median productive vocabulary size (as proportion of total words) over age in each language. Limit to WS data for children 16-30 months old.

  2. For English WS data, compute and plot the proportion of children that produce each word in the “toys” category at each age. Now do the same thing but separately for girls and boys.

wordbankr

Median vocabulary sizes

# Number of rows of type "word" on the WS form, per language.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# Median of production / words for each language and age, using WS rows of
# admins with age between 16 and 30 inclusive.
vocab_summary <- admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  # Name the join key explicitly rather than relying on the natural join,
  # so the match stays on `language` even if the tables gain shared columns.
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age) %>%
  summarise(median_vocab = median(prop_vocab)) %>%
  # summarise() only peels off the last grouping level; drop the rest.
  ungroup()

# One panel per language, y axis fixed to the 0-1 range.
ggplot(vocab_summary, aes(x = age, y = median_vocab)) +
  facet_wrap(~language) +
  geom_point() +
  ylim(0, 1) +
  labs(x = "Age (months)", y = "Productive vocabulary size")

wordbankr

Toy trajectories

# Rows of items_eng_ws that are of type "word" in the "toys" category.
toys <- filter(items_eng_ws, type == "word", category == "toys")

# Instrument data for those items, passing the administration and item tables
# along, plus a logical `produces` column that is TRUE only where value is
# "produces" (NA values count as FALSE).
toys_data <- get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = toys$item_id,
  administrations = admins_eng_ws,
  iteminfo = toys
)
toys_data <- mutate(toys_data, produces = value %in% "produces")

# Per definition and age: row count and the share of rows with produces TRUE.
toys_summary <- summarise(
  group_by(toys_data, definition, age),
  total = n(),
  prop_produces = sum(produces) / total
)

# One panel per definition; binomial GLM curve weighted by the row count.
toys_summary %>%
  ggplot(aes(age, prop_produces)) +
  geom_smooth(
    aes(weight = total),
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  ) +
  facet_wrap(~definition) +
  labs(x = "Age (months)", y = "Proportion of children producing")

wordbankr

Toy trajectories by sex

# Same summary as by definition and age, additionally split by sex; rows with
# a missing sex are dropped first.
toys_summary_sex <- summarise(
  toys_data %>% filter(!is.na(sex)) %>% group_by(definition, age, sex),
  total = n(),
  prop_produces = sum(produces) / total
)

# One panel per definition, one coloured binomial GLM curve per sex,
# legend placed above the panels with an empty title.
toys_summary_sex %>%
  ggplot(aes(age, prop_produces, colour = sex)) +
  geom_smooth(
    aes(weight = total),
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  ) +
  facet_wrap(~definition) +
  scale_colour_ptol(name = "") +
  theme(legend.position = "top") +
  labs(x = "Age (months)", y = "Proportion of children producing")

wordbankr: AoA

wordbankr: AoA

wordbankr: AoA

wordbankr: AoA

# fit_aoa() on toys_data with its defaults, keeping the definition and aoa
# columns; then again with method "glmrob" and proportion 0.8.
toys_data %>%
  fit_aoa() %>%
  select(definition, aoa)
toys_data %>%
  fit_aoa(method = "glmrob", proportion = 0.8) %>%
  select(definition, aoa)

wordbankr: unilemmas

# Call get_crossling_items() with no arguments, then get_crossling_data()
# with uni_lemmas set to "dog".
get_crossling_items()
get_crossling_data(uni_lemmas = "dog")

childesr

childesr

# Attach childesr, open its package help index, and list the objects in its
# attached package environment.
library(childesr)
help(package = "childesr")
ls("package:childesr")
##  [1] "connect_to_childes"     "get_collections"       
##  [3] "get_contexts"           "get_corpora"           
##  [5] "get_database_version"   "get_participants"      
##  [7] "get_speaker_statistics" "get_tokens"            
##  [9] "get_transcripts"        "get_types"             
## [11] "get_utterances"

childesr: collections, corpora

# Call get_collections() and get_corpora() with no arguments.
get_collections()
get_corpora()

childesr: transcripts

# Transcripts selected by collection name, then by a vector of corpus names.
get_transcripts(collection = "Eng-NA")
get_transcripts(corpus = c("Brown", "Clark"))

childesr: participants

# Participants selected by corpus, then by collection together with an age
# argument of c(24, 36).
# NOTE(review): presumably a lower/upper age bound — confirm the units and
# whether the bounds are inclusive in ?get_participants.
get_participants(corpus = "Clark")
get_participants(collection = "Eng-NA", age = c(24, 36))

childesr: utterances

# Utterances from corpus "Clark": first with role = "target_child", then with
# the same role passed to role_exclude instead.
get_utterances(corpus = "Clark", role = "target_child")
get_utterances(corpus = "Clark", role_exclude = "target_child")

childesr: types

# Types with type = "dog": first within corpus "Clark", then within collection
# "Eng-NA" with role = "target_child".
get_types(corpus = "Clark", type = "dog")
get_types(collection = "Eng-NA", role = "target_child", type = "dog")

childesr: tokens

# Tokens with token = "dog" for role "target_child" in corpus "Clark"; the
# second call differs only by replace = FALSE.
# NOTE(review): the effect of `replace` is not visible here — see ?get_tokens.
get_tokens(corpus = "Clark", role = "target_child", token = "dog")
get_tokens(corpus = "Clark", role = "target_child", token = "dog", replace = FALSE)

childesr: tokens

# `token` given as a vector of two words, then as the pattern "dog%".
# NOTE(review): presumably "%" acts as a wildcard — confirm in ?get_tokens.
get_tokens(corpus = "Clark", role = "target_child", token = c("dog", "cat"))
get_tokens(corpus = "Clark", role = "target_child", token = "dog%")

childesr: tokens

# token = "*" combined with a stem filter ("run"), then with a part_of_speech
# filter ("v").
# NOTE(review): presumably "*" matches any token so that only the other filter
# restricts the result — confirm in ?get_tokens.
get_tokens(corpus = "Clark", role = "target_child", token = "*", stem = "run")
get_tokens(corpus = "Clark", role = "target_child", token = "*", part_of_speech = "v")

childesr: speaker statistics

# Speaker statistics requested for corpus "Brown", role "target_child";
# the outer parentheses print the stored result.
(brown_stats <- get_speaker_statistics(
  corpus = "Brown",
  role = "target_child"
))

childesr: speaker statistics

# mlu_w against target_child_age, coloured by target_child_name, with a
# straight-line fit per colour and an untitled legend.
brown_stats %>%
  ggplot(aes(target_child_age, mlu_w, colour = target_child_name)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  scale_colour_ptol(name = "")

childesr

Exercises

  1. For each corpus, plot the number of transcripts in it against its mean length in number of tokens spoken by everyone other than the target child.

  2. Retrieve and plot the number of times each child in the Brown corpus said each inflection of the verb “go” over age.

childesr

Transcript stats

# Number of rows of the transcripts table per corpus (corpus_name and language
# are carried along as grouping columns).
transcripts <- get_transcripts()
corpus_transcripts <- transcripts %>%
  group_by(corpus_id, corpus_name, language) %>%
  summarise(num_transcripts = n()) %>%
  # summarise() only peels off the last grouping level; drop the rest.
  ungroup()

# Speaker statistics requested with role_exclude = "target_child".
speaker_stats <- get_speaker_statistics(role_exclude = "target_child")

# NOTE(review): this averages num_tokens over the rows of speaker_stats within
# each corpus. If a transcript contributes several rows (e.g. one per speaker),
# that is a per-row mean rather than a per-transcript mean — confirm which one
# the exercise intends.
corpus_tokens <- speaker_stats %>%
  group_by(corpus_id) %>%
  summarise(mean_tokens = mean(num_tokens)) %>%
  # Name the join key explicitly rather than relying on the natural join.
  left_join(corpus_transcripts, by = "corpus_id")

ggplot(corpus_tokens, aes(x = num_transcripts, y = mean_tokens)) +
  geom_point()

childesr

Transcript stats

childesr

Brown “go” frequencies

# Tokens requested for corpus "Brown", role "target_child", with stem "go" and
# token "*".
go_tokens <- get_tokens(
  corpus = "Brown",
  role = "target_child",
  token = "*",
  stem = "go"
)

# Cut target_child_age into 20 intervals, then count rows per child name,
# age interval and gloss.
go_summary <- go_tokens %>%
  mutate(age = cut(target_child_age, breaks = 20)) %>%
  group_by(target_child_name, age, gloss) %>%
  summarise(num_tokens = n())

# One free-scaled panel per child; one line per gloss across the age intervals.
go_summary %>%
  ggplot(aes(age, num_tokens, colour = gloss, group = gloss)) +
  geom_line() +
  facet_wrap(~target_child_name, scales = "free") +
  scale_colour_ptol()

childesr

Brown “go” frequencies

Mini-project!

Use data from Wordbank and/or childes-db to explore some question about language learning. Here are a few ideas:

Wordbank
– Explore the relationship between vocabulary size and grammatical ability (the items of type “complexity”).
– Look at the composition of vocabulary – what proportion of words that children know are which lexical category – and how it changes over age.

childes-db
– Characterize the developmental trajectory of children’s lexical diversity (e.g. MTLD) and how it differs by gender.
– Estimate the frequencies of color terms (or some other interesting set of words) in speech to children over age.

Both
– For some set of words, estimate their age of acquisition from Wordbank and frequency in child-directed speech from childes-db and examine the relationship between them.
– Determine which words are earliest-learned according to CDI data and according to corpus data and compare the two.

Resources

Wordbank
wordbank.stanford.edu
github.com/langcog/wordbankr
langcog.github.io/wordbankr
mb-cdi.stanford.edu
Citation: Frank, M. C., Braginsky, M., Yurovsky, D., & Marchman, V. A. (2017). Wordbank: An open repository for developmental vocabulary data. Journal of Child Language, 44(3), 677-694.

childes-db
childes-db.stanford.edu/
github.com/langcog/childesr
childes.talkbank.org
Citation: Sanchez, A., Meylan, S. C., Braginsky, M., MacDonald, K. E., Yurovsky, D., & Frank, M. C. (2019). childes-db: A flexible and reproducible interface to the Child Language Data Exchange System. Behavior Research Methods, 1-14.

This presentation
github.com/mikabr/acq-tools
mikabr.github.io/acq-tools

Contact: mikabr@mit.edu